Tokenize the SOTU speech collections using the NLTK package, then write a program that computes the frequency of words and parts of speech (POS). Compare the speeches on the following properties: vocabulary size (with and without stop words), the fraction of tokens that are stop words, average characters per word, number of capital letters, counts of nouns, adjectives, verbs, adverbs, and pronouns, and the top ten words for each POS.
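The tokenizer and tagger below rely on NLTK's pretrained models. If they are not already installed, a one-time download along these lines is needed first (a minimal setup sketch; resource names vary slightly across NLTK versions):

import nltk
nltk.download('punkt')                       # tokenizer models used by word_tokenize ('punkt_tab' in newer NLTK releases)
nltk.download('averaged_perceptron_tagger')  # tagger model used by pos_tag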
In [1]:
import re
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from itertools import groupby
from math import log
In [2]:
def pos_stats(filename):
    """Tokenize one speech file and collect word- and POS-frequency statistics."""
    cap_pattern = re.compile("[A-Z]")
    sum_stats = dict()
    with open('stopwords.txt') as stop:
        stop_words = [w.strip().lower() for w in stop]
    with open(filename) as text:
        text_string = text.read()
    text_words = word_tokenize(text_string)
    text_words.sort()
    # Word frequency count with case kept: (word, frequency)
    word_freq_cap = [(key, len(list(group))) for key, group in groupby(text_words)]
    with open('POS output/' + filename + '_withCap.txt', 'w') as target:
        target.write(str(word_freq_cap))
    # Word frequency count with case folded. The tokens must be re-sorted
    # on their lowercase form first, or groupby will not merge "The"/"the".
    words_lower = sorted(w.lower() for w in text_words)
    word_freq = [(key, len(list(group))) for key, group in groupby(words_lower)]
    with open('POS output/' + filename + '_noCap.txt', 'w') as target:
        target.write(str(word_freq))
    # Filter out stop words
    word_filtered = [f for f in word_freq if f[0] not in stop_words]
    # Vocabulary size with and without stop words
    sum_stats["vocabulary size"] = [len(word_freq), len(word_filtered)]
    # Stop-word tokens as a fraction of all tokens
    stop_freq = 0
    total_freq = 0
    for w, frq in word_freq:
        if w in stop_words:
            stop_freq += frq
        total_freq += frq
    sum_stats["stop word frequency"] = stop_freq / float(total_freq)
    # Average number of characters per word and number of capital letters
    total_chars = 0
    total_words = 0
    total_capital = 0
    for word, freq in word_freq_cap:
        total_chars += len(word) * freq
        total_words += freq
        total_capital += freq * len(cap_pattern.findall(word))
    if total_words != 0:
        sum_stats["avg chars per word"] = total_chars / float(total_words)
    sum_stats["number of capital letters"] = total_capital
    # POS tagging on the lowercased text, without filtering stop words
    pos_info = pos_tag(word_tokenize(text_string.lower()))
    with open('POS output/' + filename + 'pos_noCap.txt', 'w') as target:
        target.write(str(pos_info))
    # Count the POS classes of interest. Only the base Penn Treebank tags
    # are mapped, so inflected tags such as NNS or VBD are not counted here.
    cross_walk = {'NN': 'noun', 'JJ': 'adjectives',
                  'VB': 'verbs', 'RB': 'adverbs', 'PRP': 'pronouns'}
    pos_counts = dict()
    for word, pos in pos_info:
        pos_type = cross_walk.get(pos)
        if pos_type:
            pos_counts[pos_type] = pos_counts.get(pos_type, 0) + 1
    sum_stats["POS counts"] = pos_counts
    # Count each (word, pos) pair: sort so that identical pairs are adjacent
    pos_info_sort = sorted(pos_info, key=lambda x: (x[1], x[0]))
    pos_word_counts = [(k, len(list(group))) for k, group in groupby(pos_info_sort)]
    # Group the pair counts by tag and sort each group by count, descending
    tmp1 = sorted(pos_word_counts, key=lambda x: x[0][1])
    tmp2 = [(k, sorted(group, key=lambda x: x[1], reverse=True))
            for k, group in groupby(tmp1, lambda x: x[0][1])]
    # Top ten words for each tag. Tags outside the crosswalk keep their
    # Penn Treebank name instead of all collapsing onto a None key.
    topwords_by_pos = dict()
    for pos_type, counts in tmp2:
        top_ten = [(pair[0][0], pair[1]) for pair in counts[:10]]
        topwords_by_pos[cross_walk.get(pos_type, pos_type)] = top_ten
    sum_stats["top words by pos"] = topwords_by_pos
    # (frequency, # of distinct words with that frequency) on log scales;
    # computed for a Zipf-style frequency plot but not stored
    word_sorted = sorted(word_filtered, key=lambda x: x[1])
    cnt_word = [(key, len(list(group))) for key, group in groupby(word_sorted, lambda x: x[1])]
    log_word = [(log(x), log(y)) for x, y in cnt_word]
    with open('POS output/' + filename + 'summary.txt', 'w') as target:
        target.write(str(sum_stats))
    return sum_stats
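As a design note, the sort-then-groupby counting above could equally be done with collections.Counter, which needs no prior sorting. A minimal sketch of the same two frequency tables (the helper name word_counts is illustrative, not part of the notebook):

from collections import Counter

def word_counts(tokens):
    # Returns case-sensitive and case-folded (word, count) tables,
    # sorted alphabetically to match the groupby output above.
    with_cap = sorted(Counter(tokens).items())
    no_cap = sorted(Counter(t.lower() for t in tokens).items())
    return with_cap, no_cap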
In [3]:
pos_stats('sotu1790-2015.txt')
Out[3]:
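The log_word pairs computed inside pos_stats are never stored or plotted; they appear intended for a log-log view of how many distinct words occur at each frequency (a Zipf-style check). A sketch of such a plot, assuming matplotlib is available and the pairs are returned or recomputed (plot_zipf is a hypothetical helper):

import matplotlib.pyplot as plt

def plot_zipf(log_word, title):
    # log_word: list of (log frequency, log number-of-distinct-words) pairs
    xs, ys = zip(*log_word)
    plt.scatter(xs, ys)
    plt.xlabel('log word frequency')
    plt.ylabel('log number of distinct words')
    plt.title(title)
    plt.show()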
In [4]:
pos_stats('1GW.txt')
Out[4]:
In [5]:
pos_stats('2JA.txt')
Out[5]:
In [6]:
pos_stats('3TJ.txt')
Out[6]:
In [7]:
pos_stats('4JM.txt')
Out[7]:
In [8]:
pos_stats('5JM.txt')
Out[8]:
In [9]:
pos_stats('6JQA.txt')
Out[9]:
In [10]:
pos_stats('7AJ.txt')
Out[10]:
In [11]:
pos_stats('8MVB.txt')
Out[11]:
In [12]:
pos_stats('10JT.txt')
Out[12]:
In [13]:
pos_stats('11JP.txt')
Out[13]:
In [14]:
pos_stats('12ZT.txt')
Out[14]:
In [15]:
pos_stats('13MF.txt')
Out[15]:
In [16]:
pos_stats('14FP.txt')
Out[16]:
In [17]:
pos_stats('15JB.txt')
Out[17]:
In [18]:
pos_stats('16AL.txt')
Out[18]:
In [19]:
pos_stats('17AJ.txt')
Out[19]:
In [20]:
pos_stats('18USG.txt')
Out[20]:
In [21]:
pos_stats('19RBH.txt')
Out[21]:
In [22]:
pos_stats('21CA.txt')
Out[22]:
In [23]:
pos_stats('22GC.txt')
Out[23]:
In [24]:
pos_stats('23BH.txt')
Out[24]:
In [25]:
pos_stats('25WM.txt')
Out[25]:
In [26]:
pos_stats('26TR.txt')
Out[26]:
In [27]:
pos_stats('27WT.txt')
Out[27]:
In [28]:
pos_stats('28WW.txt')
Out[28]:
In [29]:
pos_stats('29WH.txt')
Out[29]:
In [30]:
pos_stats('30CC.txt')
Out[30]:
In [31]:
pos_stats('31HH.txt')
Out[31]:
In [32]:
pos_stats('32FDR.txt')
Out[32]:
In [33]:
pos_stats('33HT.txt')
Out[33]:
In [34]:
pos_stats('34DE.txt')
Out[34]:
In [35]:
pos_stats('35JFK.txt')
Out[35]:
In [36]:
pos_stats('36LBJ.txt')
Out[36]:
In [37]:
pos_stats('37RN.txt')
Out[37]:
In [38]:
pos_stats('38GRF.txt')
Out[38]:
In [39]:
pos_stats('39JC.txt')
Out[39]:
In [40]:
pos_stats('40RR.txt')
Out[40]:
In [41]:
pos_stats('41GHB.txt')
Out[41]:
In [42]:
pos_stats('42WC.txt')
Out[42]:
In [43]:
pos_stats('43GWB.txt')
Out[43]:
In [44]:
pos_stats('44OB.txt')
Out[44]:
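The per-president cells above could also be driven from a single loop, which keeps the results in one place for the comparison the assignment asks for. A sketch assuming the same filenames (the list here is abbreviated, not exhaustive):

speech_files = ['sotu1790-2015.txt', '1GW.txt', '2JA.txt']  # ... remaining files as above
all_stats = {f: pos_stats(f) for f in speech_files}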